3  ETL

Code
import os
from pandas.plotting import autocorrelation_plot
import warnings
import pandas as pd
import seaborn as sns
import requests
import plotly.express as px
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import plotly.graph_objects as go
from IPython.display import display
from scipy.stats import ks_2samp
# from TSPackages import *
from scipy.stats import jarque_bera
from scipy.stats import ks_2samp
from scipy.stats import kurtosis, skew
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
warnings.filterwarnings('ignore')
from pandas.plotting import autocorrelation_plot
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt
import os
Code
# Directory that holds the INMET exports (one CSV per period).
_datos_dir = r"C:\Users\Keyla Alba\OneDrive - Universidad del Norte\Doctorado (Ciencias)\Move La America\EDA\Datos"

# (first day, last day) of every export, written as they appear in the file
# names: six full calendar years plus the partial 2025 file.
_periodos = [(f"01-01-{anio}", f"31-12-{anio}") for anio in range(2019, 2025)]
_periodos.append(("01-01-2025", "28-02-2025"))

# Full paths, in chronological order.
file_path = [
    _datos_dir + "\\" + f"INMET_CO_MS_A710_PARANAIBA_{inicio}_A_{fin}.CSV"
    for inicio, fin in _periodos
]

def load_and_clean(file_path):
    """Load one INMET hourly CSV export and return its global-radiation series.

    The file is read as a ';'-separated, latin1-encoded table whose first
    eight lines are skipped. Column names are stripped and upper-cased, and
    the first column whose name contains both "RADIACAO" and "KJ" is renamed
    to ``RADIACAO_GLOBAL``.

    Radiation values written with a decimal comma (``"2434,5"``) are
    converted to floats. Without this step ``pd.to_numeric`` coerces them to
    NaN and they end up as 0, indistinguishable from an empty cell.

    Parameters
    ----------
    file_path : str or path-like
        Location of a single CSV export.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``datetime`` (``DATA`` + ``HORA UTC`` parsed with
        ``%Y/%m/%d %H:%M``; NaT when parsing fails) and ``RADIACAO_GLOBAL``
        (float; empty or unparseable cells become 0).

    Raises
    ------
    KeyError
        If the file has no ``DATA``, ``HORA UTC`` or radiation column.
    """
    df = pd.read_csv(file_path, sep=';', encoding='latin1', skiprows=8)
    df.columns = [col.strip().upper() for col in df.columns]

    # Only the first matching column is renamed.
    columna_radiacion = next(
        (col for col in df.columns if "RADIACAO" in col and "KJ" in col),
        None,
    )
    if columna_radiacion is not None:
        df = df.rename(columns={columna_radiacion: "RADIACAO_GLOBAL"})

    # "1300 UTC" -> "1300" -> "13:00"
    hora = df['HORA UTC'].astype(str).str.replace(' UTC', '', regex=False)
    hora = hora.str.zfill(4)
    df['HORA UTC'] = hora.str[:2] + ':' + hora.str[2:4]
    df['datetime'] = pd.to_datetime(
        df['DATA'] + ' ' + df['HORA UTC'],
        format='%Y/%m/%d %H:%M',
        errors='coerce',
    )

    radiacion = df['RADIACAO_GLOBAL']
    if not pd.api.types.is_numeric_dtype(radiacion):
        # Text column: normalise the decimal comma before converting, so that
        # valid readings are not turned into NaN (and then 0) by the coercion.
        radiacion = (
            radiacion.astype(str).str.strip().str.replace(',', '.', regex=False)
        )
    df['RADIACAO_GLOBAL'] = pd.to_numeric(radiacion, errors='coerce').fillna(0)

    return df[['datetime', 'RADIACAO_GLOBAL']]
# Clean every export and stack the results into one frame with a fresh
# 0..n-1 index.
_tablas_por_archivo = list(map(load_and_clean, file_path))
df_INMET = pd.concat(_tablas_por_archivo, ignore_index=True)
print(df_INMET.shape)
df_INMET.head(n=14)
(54024, 2)
datetime RADIACAO_GLOBAL
0 2019-01-01 00:00:00 0.0
1 2019-01-01 01:00:00 0.0
2 2019-01-01 02:00:00 0.0
3 2019-01-01 03:00:00 0.0
4 2019-01-01 04:00:00 0.0
5 2019-01-01 05:00:00 0.0
6 2019-01-01 06:00:00 0.0
7 2019-01-01 07:00:00 0.0
8 2019-01-01 08:00:00 0.0
9 2019-01-01 09:00:00 0.0
10 2019-01-01 10:00:00 0.0
11 2019-01-01 11:00:00 0.0
12 2019-01-01 12:00:00 0.0
13 2019-01-01 13:00:00 2434.0
Code
# Interactive line chart of the raw (non-imputed) hourly radiation.
etiquetas_ejes = {
    'datetime': 'Fecha y Hora',
    'RADIACAO_GLOBAL': 'Radiación (KJ/m²)',
}
fig_INMET = px.line(
    df_INMET,
    x='datetime',
    y='RADIACAO_GLOBAL',
    labels=etiquetas_ejes,
    title='Radiación Global (KJ/m²) por Hora INMET (2019 a Febrero 2025)',
)
fig_INMET.update_layout(width=1100, height=500, template='plotly_white')
fig_INMET.show()

3.0.0.1 Imputación usando promedio estacional

Uno de los retos fundamentales en el análisis de series temporales de radiación solar es la presencia de valores faltantes, especialmente en estaciones meteorológicas con limitaciones técnicas o períodos de inestabilidad climática. Una alternativa eficaz para abordar este problema es la Regla de Imputación Estacional, la cual consiste en reemplazar los datos perdidos utilizando el promedio de los valores registrados en la misma hora y día de otros años o meses, respetando así la estacionalidad y los patrones cíclicos propios de la variable. Esta técnica resulta particularmente adecuada para variables como la radiación solar, que presenta una fuerte dependencia temporal y un comportamiento periódico. En este contexto, estudios previos analizaron el rendimiento de métodos univariantes de imputación bajo diferentes condiciones climáticas, concluyendo que las estrategias basadas en estacionalidad ofrecen ventajas significativas en climas tropicales. De manera complementaria, otros trabajos enfatizan que una adecuada imputación mejora sustancialmente la precisión de los modelos de predicción basados en aprendizaje automático, al reducir la incertidumbre asociada al preprocesamiento de datos meteorológicos.

Regla de Imputación Estacional

Supongamos que tienes una serie de datos:

  • \(R_i\): valor de radiación en la observación \(i\)
  • \(d_i\): día del año de la observación \(i\) (de 1 a 366)
  • \(h_i\): hora del día de la observación \(i\) (de 0 a 23)

Para cada observación con valor faltante (codificado en la serie como \(R_i = 0\)), la imputación es:

\[ \hat{R}_i = \frac{1}{n_{d_i,h_i}} \sum_{j \in G(d_i,h_i)} R_j \]

Donde:

  • \(\hat{R}_i\) es el valor imputado para la observación \(i\)
  • \(G(d_i, h_i)\) es el conjunto de observaciones \(j\) donde \(R_j > 0\), y el día y la hora coinciden con \(d_i, h_i\)
  • \(n_{d_i,h_i}\) es el número de valores disponibles en ese grupo

Si no existen datos en ese grupo:

\[ \hat{R}_i = 0 \]

Code
# Seasonal-mean imputation. A reading of 0 is treated as missing and is
# replaced by the mean of the non-zero readings that share its day of year
# and hour. Groups with no non-zero reading stay at 0.
_radiacion = df_INMET['RADIACAO_GLOBAL']

# Mark zeros as NaN with `where` so the column stays float. Replacing with
# pd.NA can turn it into an object column and break numeric aggregation.
df_INMET['RADIACAO_GLOBAL_IMPUTADA'] = _radiacion.where(_radiacion != 0)
df_INMET['datetime'] = pd.to_datetime(df_INMET['datetime'])
df_INMET['dayofyear'] = df_INMET['datetime'].dt.dayofyear
df_INMET['hour'] = df_INMET['datetime'].dt.hour

_grupos = df_INMET.groupby(['dayofyear', 'hour'])['RADIACAO_GLOBAL_IMPUTADA']

# Lookup table (day of year, hour) -> mean of the observed values.
tabla_promedios = _grupos.mean()

# Per-row group mean, aligned with df_INMET. Rows whose timestamp is NaT have
# no group and get NaN here, so they fall through to the final fillna(0)
# instead of crashing on an int(NaN) conversion.
_promedio_del_grupo = _grupos.transform('mean')

df_INMET['RADIACAO_GLOBAL_IMPUTADA'] = (
    df_INMET['RADIACAO_GLOBAL_IMPUTADA']
    .fillna(_promedio_del_grupo)
    .fillna(0)
)
print(df_INMET.shape)
df_INMET.tail(20)
(54024, 5)
datetime RADIACAO_GLOBAL RADIACAO_GLOBAL_IMPUTADA dayofyear hour
54004 2025-02-28 04:00:00 0.0 0.0 59 4
54005 2025-02-28 05:00:00 0.0 0.0 59 5
54006 2025-02-28 06:00:00 0.0 0.0 59 6
54007 2025-02-28 07:00:00 0.0 0.0 59 7
54008 2025-02-28 08:00:00 0.0 0.0 59 8
54009 2025-02-28 09:00:00 0.0 0.0 59 9
54010 2025-02-28 10:00:00 0.0 0.0 59 10
54011 2025-02-28 11:00:00 0.0 0.0 59 11
54012 2025-02-28 12:00:00 0.0 1749.0 59 12
54013 2025-02-28 13:00:00 0.0 0.0 59 13
54014 2025-02-28 14:00:00 0.0 2989.0 59 14
54015 2025-02-28 15:00:00 0.0 0.0 59 15
54016 2025-02-28 16:00:00 0.0 1930.0 59 16
54017 2025-02-28 17:00:00 0.0 0.0 59 17
54018 2025-02-28 18:00:00 0.0 3146.0 59 18
54019 2025-02-28 19:00:00 0.0 0.0 59 19
54020 2025-02-28 20:00:00 0.0 0.0 59 20
54021 2025-02-28 21:00:00 0.0 0.0 59 21
54022 2025-02-28 22:00:00 0.0 97.0 59 22
54023 2025-02-28 23:00:00 0.0 0.0 59 23
Code
# Overlay of the imputed and the raw series. Traces are added in this order,
# so the raw series is drawn on top.
fig = go.Figure()
_trazas = [
    ('RADIACAO_GLOBAL_IMPUTADA', 'INMET-IMPUTADA', 'orange'),
    ('RADIACAO_GLOBAL', 'INMET-SIN IMPUTAR', 'blue'),
]
for _columna, _nombre, _color in _trazas:
    fig.add_trace(
        go.Scatter(
            x=df_INMET['datetime'],
            y=df_INMET[_columna],
            mode='lines',
            name=_nombre,
            line=dict(color=_color),
            opacity=0.7,
        )
    )

_opciones_layout = dict(
    title='Evolución de Radiación INMET - Imputación Promedio Estacional (2019-2025)',
    xaxis_title='Fecha y Hora',
    yaxis_title='Radiación (KJ/m²)',
    legend_title='Fuente',
    template='plotly_white',
    height=500,
    width=1100,
)
fig.update_layout(**_opciones_layout)
fig.show()
Code
# Strictly positive readings before and after imputation.
rad_orig = df_INMET.loc[df_INMET['RADIACAO_GLOBAL'] > 0, 'RADIACAO_GLOBAL']
rad_imp = df_INMET.loc[
    df_INMET['RADIACAO_GLOBAL_IMPUTADA'] > 0, 'RADIACAO_GLOBAL_IMPUTADA'
]

# Side-by-side histograms sharing the y axis.
fig, axes = plt.subplots(1, 2, figsize=(16, 5), sharey=True)
_paneles = [
    (rad_orig, 'royalblue', 'RADIACAO_GLOBAL (> 0)', 'Frecuencia'),
    (rad_imp, 'darkorange', 'RADIACAO_GLOBAL_IMPUTADA (> 0)', ''),
]
for _eje, (_muestra, _color, _titulo, _etiqueta_y) in zip(axes, _paneles):
    sns.histplot(_muestra, bins=60, kde=True, color=_color, ax=_eje)
    _eje.set_title(_titulo)
    _eje.set_xlabel('Radiación (KJ/m²)')
    _eje.set_ylabel(_etiqueta_y)
    _eje.grid(True)

fig.suptitle('Distribuciones de Radiación Solar INMET (Imputación Promedio Estacional) (> 0) - Antes y Después ', fontsize=16)
plt.tight_layout()
plt.show()

# Two-sample Kolmogorov-Smirnov test on the same two samples.
stat_ks, p_ks = ks_2samp(rad_orig, rad_imp)
print("Kolmogorov-Smirnov Test (Distribución Original vs Imputada):")
print(f"Estadístico KS: {stat_ks:.4f}")
print(f"p-valor: {p_ks:.4e}")
_veredicto = (
    "No se rechaza H0 → Las distribuciones son estadísticamente similares."
    if p_ks > 0.05
    else "Se rechaza H0 → Las distribuciones son diferentes."
)
print(_veredicto)

Kolmogorov-Smirnov Test (Distribución Original vs Imputada):
Estadístico KS: 0.0153
p-valor: 8.3915e-01
No se rechaza H0 → Las distribuciones son estadísticamente similares.

3.0.0.2 EDA

En el contexto de la predicción de radiación solar mediante modelos de series de tiempo, es fundamental conservar la estructura completa de la serie, incluyendo los valores de radiación nocturna igual a cero. El mantenimiento de esta continuidad temporal asegura que los algoritmos puedan captar correctamente la dinámica estacional y los ciclos diarios, evitando la introducción de sesgos por omisión de periodos sin registros. Este enfoque es especialmente relevante al trabajar con modelos como redes neuronales recurrentes, regresión de soporte vectorial o procesos gaussianos, los cuales requieren secuencias homogéneas y completas para una convergencia adecuada y una generalización efectiva.

Adicionalmente, investigaciones que comparan arquitecturas híbridas de redes neuronales han resaltado que la inclusión de todos los datos horarios, incluyendo los correspondientes a la noche, contribuye a un entrenamiento más robusto y coherente, permitiendo una mejor adaptación a diferentes contextos geográficos y meteorológicos.

3.0.0.2.1 Horas
Code
# Save the imputed table in the working directory, without the row index.
_ruta_salida = 'df_INMET.csv'
df_INMET.to_csv(_ruta_salida, index=False)
Code
# Interactive view of the imputed hourly series.
df_INMET['datetime'] = pd.to_datetime(df_INMET['datetime'])
_etiquetas = {'datetime': 'Date', 'RADIACAO_GLOBAL_IMPUTADA': 'Radiation'}
fig = px.line(
    df_INMET,
    x='datetime',
    y='RADIACAO_GLOBAL_IMPUTADA',
    labels=_etiquetas,
    title='Time Series - Global Radiation',
)

# The axis titles set here are the ones shown on the figure.
_opciones_layout = {
    'xaxis_title': 'Fecha',
    'yaxis_title': 'Radiación',
    'template': 'plotly_white',
    'width': 1000,
    'height': 500,
}
fig.update_layout(**_opciones_layout)
fig.show()
Code
# Index the table by timestamp and save a static plot of the imputed series.
# The guard makes the cell safe to re-run: once 'datetime' has become the
# index there is no such column left to convert or set.
if 'datetime' in df_INMET.columns:
    df_INMET['datetime'] = pd.to_datetime(df_INMET['datetime'])
    df_INMET = df_INMET.set_index('datetime')

os.makedirs("figures", exist_ok=True)
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(df_INMET['RADIACAO_GLOBAL_IMPUTADA'], color='blue')
ax.set_title('Time Series - Global Radiation', fontsize=18)
ax.set_xlabel('Date', fontsize=14)
# The series comes from the KJ/m² radiation column, so the axis is labelled
# in that unit rather than W/m².
ax.set_ylabel('Radiation (KJ/m²)', fontsize=14)
ax.grid(True)
fig.tight_layout()
fig.savefig("figures/Global_Radiation_TimeSeries.png", dpi=300)
plt.show()

Code
import pandas as pd
import matplotlib.pyplot as plt
import os

# Full imputed series with an inset zoom on 10-15 January 2020.
#
# The plot uses the timestamps parsed from the source files. An earlier
# version overwrote them with a synthetic hourly range starting in 2019,
# which would silently shift every later reading if the data ever had a
# missing, duplicated or unparseable hour.
if 'datetime' in df_INMET.columns:
    df_INMET['datetime'] = pd.to_datetime(df_INMET['datetime'])
    df_INMET = df_INMET.set_index('datetime')

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(df_INMET.index, df_INMET['RADIACAO_GLOBAL_IMPUTADA'], color='blue')
ax.set_title('Time Series - Global Radiation', fontsize=18)
ax.set_xlabel('Date', fontsize=14)
# The series comes from the KJ/m² radiation column.
ax.set_ylabel('Radiation (KJ/m²)', fontsize=14)
ax.grid(True)

interval_start = pd.Timestamp("2020-01-10 00:00:00")
interval_end = pd.Timestamp("2020-01-15 00:00:00")
# Label-based slice on the datetime index; both bounds are included.
zoom_data = df_INMET.loc[interval_start:interval_end]

ax_inset = fig.add_axes([0.6, 0.45, 0.25, 0.4])  # [left, bottom, width, height]
ax_inset.plot(zoom_data.index, zoom_data['RADIACAO_GLOBAL_IMPUTADA'], color='red')
ax_inset.set_title('Zoom: Jan 10–15, 2020', fontsize=10)
ax_inset.grid(True)
# The inset shows only the shape of the daily cycle: no ticks, no labels.
ax_inset.set_xticks([])
ax_inset.set_yticks([])
ax_inset.set_xlabel('')
ax_inset.set_ylabel('')

os.makedirs("figures", exist_ok=True)
plt.tight_layout()
plt.savefig("figures/Global_Radiation_TimeSeries_Zoom.png", dpi=300)
plt.show()

Code
# Descriptive statistics of the hourly imputed series, one column per
# variable and one row per statistic.
variables = ['RADIACAO_GLOBAL_IMPUTADA']

# (row label, function applied to the series), in output order.
_estadisticos = [
    ('N_records', len),
    ('μ', lambda s: s.mean()),
    ('σ', lambda s: s.std()),
    ('y_min', lambda s: s.min()),
    ('Q1', lambda s: s.quantile(0.25)),
    ('x̄', lambda s: s.median()),
    ('Q3', lambda s: s.quantile(0.75)),
    ('y_max', lambda s: s.max()),
    ('Kurtosis', lambda s: s.kurtosis()),
    ('Skewness', lambda s: s.skew()),
]

resumen = {}
for var in variables:
    data = df_INMET[var]
    resumen[var] = {etiqueta: calcular(data) for etiqueta, calcular in _estadisticos}
df_resumen = pd.DataFrame(resumen)
df_resumen
RADIACAO_GLOBAL_IMPUTADA
N_records 54024.000000
μ 284.327790
σ 752.818778
y_min 0.000000
Q1 0.000000
0.000000
Q3 0.000000
y_max 3924.000000
Kurtosis 6.500850
Skewness 2.742065
Code
from statsmodels.tsa.seasonal import seasonal_decompose
# Additive decomposition of the last 1000 hourly values with a 24-step period.
_ultimas_horas = df_INMET['RADIACAO_GLOBAL_IMPUTADA'].tail(1000)
descomposicion = seasonal_decompose(_ultimas_horas, model='additive', period=24)
descomposicion.plot()
plt.suptitle("Decomposition of Global Radiation (Daily Cycle)", fontsize=16)
plt.tight_layout()
plt.show()

Code
# describe() of the imputed radiation, one row per value of the 'hour' column.
_radiacion_imputada = df_INMET['RADIACAO_GLOBAL_IMPUTADA']
desc_por_hora = _radiacion_imputada.groupby(df_INMET['hour']).describe()
desc_por_hora
count mean std min 25% 50% 75% max
hour
0 2251.0 0.021324 0.291339 0.0 0.0 0.0 0.0 4.0
1 2251.0 0.024434 0.287252 0.0 0.0 0.0 0.0 5.0
2 2251.0 0.009329 0.167072 0.0 0.0 0.0 0.0 3.0
3 2251.0 0.032430 0.292600 0.0 0.0 0.0 0.0 4.0
4 2251.0 0.015549 0.184338 0.0 0.0 0.0 0.0 3.0
5 2251.0 0.011550 0.132832 0.0 0.0 0.0 0.0 2.0
6 2251.0 0.000000 0.000000 0.0 0.0 0.0 0.0 0.0
7 2251.0 0.005775 0.075792 0.0 0.0 0.0 0.0 1.0
8 2251.0 0.000000 0.000000 0.0 0.0 0.0 0.0 0.0
9 2251.0 0.822301 4.227157 0.0 0.0 0.0 0.0 51.0
10 2251.0 32.930920 96.786907 0.0 0.0 0.0 0.0 666.0
11 2251.0 185.690582 346.654214 0.0 0.0 0.0 262.0 1656.0
12 2251.0 443.797201 687.757224 0.0 0.0 0.0 913.0 2524.0
13 2251.0 593.693470 917.315057 0.0 0.0 0.0 1498.0 3033.0
14 2251.0 901.893159 1242.582789 0.0 0.0 0.0 2175.0 3425.0
15 2251.0 935.042870 1337.877727 0.0 0.0 0.0 2449.0 3793.0
16 2251.0 1129.770546 1418.127255 0.0 0.0 0.0 2611.0 3924.0
17 2251.0 834.166148 1219.894113 0.0 0.0 0.0 2212.0 3782.0
18 2251.0 775.424256 1051.873855 0.0 0.0 0.0 1827.0 3568.0
19 2251.0 499.856952 767.244822 0.0 0.0 0.0 1114.0 2760.0
20 2251.0 344.708796 527.554926 0.0 0.0 0.0 737.0 1999.0
21 2251.0 127.717015 245.809561 0.0 0.0 0.0 185.0 1276.0
22 2251.0 18.125722 60.460133 0.0 0.0 0.0 1.0 639.0
23 2251.0 0.106619 0.705033 0.0 0.0 0.0 0.0 9.0
Code
# Same decomposition as above (last 1000 hours, period 24), saved to disk.
_ultimas_horas = df_INMET['RADIACAO_GLOBAL_IMPUTADA'].tail(1000)
descomposicion = seasonal_decompose(
    _ultimas_horas,
    model='additive',
    period=24,
)
fig = descomposicion.plot()
fig.suptitle("Decomposition of Global Radiation (Daily Cycle)", fontsize=16)
plt.tight_layout()
fig.savefig('figures/decomposition_global_radiation.png')
plt.show()

Code
# Mean imputed radiation for each value of the 'hour' column, as a line plot.
os.makedirs('figures', exist_ok=True)
media_por_hora = (
    df_INMET.groupby('hour')['RADIACAO_GLOBAL_IMPUTADA']
    .mean()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(8, 6))
sns.lineplot(
    data=media_por_hora,
    x='hour',
    y='RADIACAO_GLOBAL_IMPUTADA',
    marker='o',
    ax=ax,
)
ax.set_title('Average Global Radiation by Hour', fontsize=16)
ax.set_xlabel('Hour of Day', fontsize=12)
ax.set_ylabel('Global Radiation', fontsize=12)
ax.set_xticks(range(0, 24))  # one tick per hour
ax.grid(True)
fig.savefig('figures/avg_radiation_by_hour.png')
plt.show()

Code
# Interactive box plot of the imputed radiation for each hour of the day.
os.makedirs('figures', exist_ok=True)

# Rebuild the helper column from the index if it is not there.
if 'hour' not in df_INMET.columns:
    df_INMET['hour'] = df_INMET.index.hour

_etiquetas = {
    'hour': 'Hour of Day',
    'RADIACAO_GLOBAL_IMPUTADA': 'Global Radiation',
}
fig = px.box(
    df_INMET,
    x='hour',
    y='RADIACAO_GLOBAL_IMPUTADA',
    labels=_etiquetas,
    title='Distribution of Global Radiation by Hour of the Day',
)
fig.update_layout(
    title_font_size=18,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14,
)
fig.write_html("figures/boxplot_radiation_by_hour_plotly.html")
fig.show()
Code
# Static (seaborn) box plot of the imputed radiation for each hour of the day.
os.makedirs('figures', exist_ok=True)
if 'hour' not in df_INMET.columns:
    df_INMET['hour'] = df_INMET.index.hour

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_INMET, x='hour', y='RADIACAO_GLOBAL_IMPUTADA', ax=ax)
ax.set_title('Distribution of Global Radiation by Hour of the Day', fontsize=18)
ax.set_xlabel('Hour of Day', fontsize=14)
ax.set_ylabel('Global Radiation', fontsize=14)
ax.grid(True)
fig.tight_layout()
fig.savefig('figures/boxplot_radiation_by_hour.png', dpi=300)
plt.show()

Code
# Autocorrelation plot of the last 200 hourly values.
os.makedirs('figures', exist_ok=True)
_ultimas_200_horas = df_INMET['RADIACAO_GLOBAL_IMPUTADA'].tail(200)

fig, ax = plt.subplots(figsize=(5, 4))
autocorrelation_plot(_ultimas_200_horas, ax=ax)
ax.set_title("Autocorrelation - Global Radiation", fontsize=14)
ax.grid(True)
fig.savefig("figures/autocorrelation_global_radiation.png")
plt.show()

Code
from statsmodels.graphics.tsaplots import plot_pacf

# Partial autocorrelation (100 lags) of the last 2000 hourly values.
os.makedirs("figures", exist_ok=True)

# plot_pacf opens a figure of its own unless it is given an axes. Passing
# `ax` makes it draw on the 5x4 figure created here, so no empty figure is
# left behind and the requested size is the one that gets saved.
fig, ax = plt.subplots(figsize=(5, 4))
plot_pacf(
    df_INMET['RADIACAO_GLOBAL_IMPUTADA'].tail(2000),
    lags=100,
    method='ywm',
    ax=ax,
)
ax.set_title("PACF - Global Radiation (Hourly Series)", fontsize=12)
ax.grid(True)
fig.tight_layout()
fig.savefig("figures/pacf_hourly_compact.png")
plt.show()
<Figure size 500x400 with 0 Axes>

3.0.0.3 EDA DIARIA

Code
# Daily mean of the imputed series and its descriptive statistics.
df_diario = df_INMET['RADIACAO_GLOBAL_IMPUTADA'].resample('D').mean()
variables = ['RADIACAO_GLOBAL_IMPUTADA']

# (row label, function applied to the series), in output order.
_estadisticos = [
    ('N_records', len),
    ('μ', lambda s: s.mean()),
    ('σ', lambda s: s.std()),
    ('y_min', lambda s: s.min()),
    ('Q1', lambda s: s.quantile(0.25)),
    ('x̄', lambda s: s.median()),
    ('Q3', lambda s: s.quantile(0.75)),
    ('y_max', lambda s: s.max()),
    ('Kurtosis', lambda s: s.kurtosis()),
    ('Skewness', lambda s: s.skew()),
]

resumen = {}
for var in variables:
    # There is a single variable, so the daily series itself is summarised.
    data = df_diario
    resumen[var] = {etiqueta: calcular(data) for etiqueta, calcular in _estadisticos}
df_resumen = pd.DataFrame(resumen)
df_resumen
RADIACAO_GLOBAL_IMPUTADA
N_records 2251.000000
μ 284.327790
σ 142.219504
y_min 0.000000
Q1 187.916667
263.104167
Q3 369.541667
y_max 799.916667
Kurtosis 0.287621
Skewness 0.546791
Code
# Daily series with the Jan-Mar 2022 window shaded on the main axes and
# repeated in an inset.
os.makedirs("figures", exist_ok=True)
interval_start = "2022-01-01"
interval_end = "2022-03-01"
zoom_data = df_diario.loc[interval_start:interval_end]

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(df_diario, color='blue')
ax.axvspan(
    pd.to_datetime(interval_start),
    pd.to_datetime(interval_end),
    color='gray',
    alpha=0.3,
    label='Zoom region',
)
ax.set_title('Daily Time Series - Global Radiation', fontsize=18)
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Global Radiation', fontsize=12)
ax.grid(True)

ax_inset = fig.add_axes([0.55, 0.50, 0.3, 0.35])  # [x, y, width, height]
ax_inset.plot(zoom_data, color='red')
ax_inset.set_title('Zoom: Jan–Mar 2022', fontsize=10)
ax_inset.set_xticks([])
ax_inset.set_yticks([])

plt.tight_layout()
plt.savefig("figures/daily_series_with_zoom_highlighted.png")
plt.show()

Code
# Additive decomposition of the last 200 daily means with a 7-step period.
_ultimos_200_dias = df_diario.tail(200)
descomposicion = seasonal_decompose(_ultimos_200_dias, model='additive', period=7)
descomposicion.plot()
plt.suptitle('Descomposición de Radiación Global (ciclo semanal - 7 días)', fontsize=16)
plt.tight_layout()
plt.show()

Code
# describe() of the hourly imputed radiation per weekday, listed Monday to
# Sunday. `.loc` raises KeyError if a weekday is absent from the data.
df_INMET['day_of_week'] = df_INMET.index.day_name()
orden_dias = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
desc_por_dia = (
    df_INMET.groupby('day_of_week')['RADIACAO_GLOBAL_IMPUTADA']
    .describe()
    .loc[orden_dias]
)
desc_por_dia
count mean std min 25% 50% 75% max
day_of_week
Monday 7704.0 276.915390 740.072654 0.0 0.0 0.0 0.0 3924.0
Tuesday 7728.0 288.041990 760.135891 0.0 0.0 0.0 0.0 3924.0
Wednesday 7728.0 281.359645 747.620231 0.0 0.0 0.0 0.0 3924.0
Thursday 7728.0 284.264644 751.770750 0.0 0.0 0.0 0.0 3869.0
Friday 7728.0 285.984170 756.346878 0.0 0.0 0.0 0.0 3924.0
Saturday 7704.0 285.550299 753.603337 0.0 0.0 0.0 0.0 3924.0
Sunday 7704.0 288.171102 760.189282 0.0 0.0 0.0 0.0 3924.0
Code
# Mean hourly radiation for each weekday, plotted Monday to Sunday.
os.makedirs("figures", exist_ok=True)
_dias_en_orden = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
_media_por_dia = (
    df_INMET.groupby(df_INMET.index.day_name())['RADIACAO_GLOBAL_IMPUTADA']
    .mean()
    .reindex(_dias_en_orden)
)
average_by_day = _media_por_dia.reset_index()
average_by_day.columns = ['Day', 'Average Radiation']

fig, ax = plt.subplots(figsize=(8, 6))
sns.lineplot(data=average_by_day, x='Day', y='Average Radiation', marker='o', ax=ax)
ax.set_title('Average Global Radiation by Day of the Week', fontsize=18)
ax.set_xlabel('Day of the Week', fontsize=14)
ax.set_ylabel('Global Radiation', fontsize=14)
ax.grid(True)
fig.tight_layout()
fig.savefig("figures/Average_Radiation_by_Day.png", dpi=300)
plt.show()

Code
# Interactive box plot of the daily means, grouped by weekday.
os.makedirs("figures", exist_ok=True)

daily_series = df_INMET['RADIACAO_GLOBAL_IMPUTADA'].resample('D').mean().dropna()
df_daily = daily_series.to_frame(name='RADIACAO_GLOBAL_IMPUTADA')
df_daily['Day'] = df_daily.index.day_name()
days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

_etiquetas = {
    'Day': 'Day of the Week',
    'RADIACAO_GLOBAL_IMPUTADA': 'Global Radiation (Imputed)',
}
fig = px.box(
    df_daily,
    x='Day',
    y='RADIACAO_GLOBAL_IMPUTADA',
    labels=_etiquetas,
    category_orders={'Day': days_order},
    title='Distribution of Imputed Global Radiation by Day of the Week',
)
fig.update_layout(
    title_font_size=18,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14,
)
fig.write_html("figures/Boxplot_GlobalRadiation_ByDay_Plotly.html")
fig.show()
Code
# Static (seaborn) box plot of the daily means, grouped by weekday.
os.makedirs("figures", exist_ok=True)
daily_series = df_INMET['RADIACAO_GLOBAL_IMPUTADA'].resample('D').mean().dropna()
df_daily = daily_series.to_frame(name='RADIACAO_GLOBAL_IMPUTADA')
df_daily['Day'] = df_daily.index.day_name()

_dias_en_orden = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=df_daily,
    x='Day',
    y='RADIACAO_GLOBAL_IMPUTADA',
    order=_dias_en_orden,
    ax=ax,
)
ax.set_title('Distribution of Imputed Global Radiation by Day of the Week', fontsize=18)
ax.set_xlabel('Day of the Week', fontsize=14)
ax.set_ylabel('Global Radiation (Imputed)', fontsize=14)
ax.grid(True)
fig.tight_layout()
fig.savefig("figures/Boxplot_GlobalRadiation_ByDay.png", dpi=300)
plt.show()

Code
# Autocorrelation plot of the last 200 daily means.
_ultimos_200_dias = daily_series.tail(200)
fig, ax = plt.subplots(figsize=(6, 4))
autocorrelation_plot(_ultimos_200_dias, ax=ax)
ax.set_title("Autocorrelation - Global Radiation", fontsize=12)
ax.grid(True)
fig.tight_layout()
fig.savefig("figures/Autocorrelation_GlobalRadiation_Daily_200days.png", dpi=300)
plt.show()

Code
# Partial autocorrelation (150 lags) of the last 2000 daily means.
#
# plot_pacf opens a figure of its own unless it is given an axes. Passing
# `ax` makes it draw on the 10x4 figure created here, so no empty figure is
# left behind and the requested size is the one that gets saved.
fig, ax = plt.subplots(figsize=(10, 4))
plot_pacf(daily_series.tail(2000), lags=150, method='ywm', ax=ax)
ax.set_title("PACF - Global Radiation (Daily Series)", fontsize=14)
ax.grid(True)
fig.tight_layout()
fig.savefig("figures/PACF_GlobalRadiation_DailySeries.png", dpi=300)
plt.show()
<Figure size 1000x400 with 0 Axes>

3.0.0.3.1 EDA MENSUAL
Code
# Monthly mean of the imputed series and its descriptive statistics.
_media_mensual = df_INMET['RADIACAO_GLOBAL_IMPUTADA'].resample('M').mean()
df_mensual = _media_mensual.to_frame(name='Promedio_Radiacion_Mensual')
variables = ['Promedio_Radiacion_Mensual']

# (row label, function applied to the series), in output order.
_estadisticos = [
    ('N_records', len),
    ('μ', lambda s: s.mean()),
    ('σ', lambda s: s.std()),
    ('y_min', lambda s: s.min()),
    ('Q1', lambda s: s.quantile(0.25)),
    ('x̄', lambda s: s.median()),
    ('Q3', lambda s: s.quantile(0.75)),
    ('y_max', lambda s: s.max()),
    ('Kurtosis', lambda s: s.kurtosis()),
    ('Skewness', lambda s: s.skew()),
]

resumen = {}
for var in variables:
    data = df_mensual[var]
    resumen[var] = {etiqueta: calcular(data) for etiqueta, calcular in _estadisticos}
df_resumen = pd.DataFrame(resumen)
df_resumen
Promedio_Radiacion_Mensual
N_records 74.000000
μ 284.331526
σ 49.663561
y_min 225.783565
Q1 249.974238
274.986044
Q3 313.849497
y_max 411.725806
Kurtosis 1.182565
Skewness 1.264723
Code
# Static plot of the monthly means.
# This cell writes into figures/, so make sure the directory exists even when
# the cell is run on its own.
os.makedirs("figures", exist_ok=True)

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(df_mensual, color='blue')
ax.set_title('Monthly Time Series - Global Radiation', fontsize=18)
ax.set_xlabel('Date', fontsize=18)
# The series is an average of the KJ/m² radiation column, so the axis is
# labelled in that unit rather than W/m².
ax.set_ylabel('Radiation (KJ/m²)', fontsize=18)
ax.tick_params(axis='both', labelsize=16)
ax.grid(True)
fig.tight_layout()
fig.savefig("figures/Monthly_GlobalRadiation_TimeSeries.png", dpi=300)
plt.show()

Code
# Additive decomposition of the monthly means with a 12-step period.
descomposicion = seasonal_decompose(
    df_mensual,
    model='additive',
    period=12,
)
descomposicion.plot()
plt.suptitle('Decomposition of Global Radiation (Monthly Cycle)', fontsize=16)
plt.tight_layout()
plt.show()

Code
# Statistics of the daily means grouped by calendar month (all years pooled).
df_diario = (
    df_INMET['RADIACAO_GLOBAL_IMPUTADA']
    .resample('D')
    .mean()
    .to_frame(name='RADIACAO_GLOBAL_IMPUTADA')
)
df_diario['Mes'] = df_diario.index.month
agrupado_mes_calendario = df_diario.groupby('Mes')['RADIACAO_GLOBAL_IMPUTADA']

# One column per statistic; rows are the month numbers until they are
# relabelled below.
_columnas = {
    'N_records': agrupado_mes_calendario.count(),
    'μ': agrupado_mes_calendario.mean(),
    'σ': agrupado_mes_calendario.std(),
    'y_min': agrupado_mes_calendario.min(),
    'Q1': agrupado_mes_calendario.quantile(0.25),
    'x̄': agrupado_mes_calendario.median(),
    'Q3': agrupado_mes_calendario.quantile(0.75),
    'y_max': agrupado_mes_calendario.max(),
    'Kurtosis': agrupado_mes_calendario.apply(pd.Series.kurtosis),
    'Skewness': agrupado_mes_calendario.apply(pd.Series.skew),
}
resumen_mensual_simple = pd.DataFrame(_columnas)

# Month number -> month name.
_nombres_de_mes = pd.to_datetime(resumen_mensual_simple.index, format='%m').month_name()
resumen_mensual_simple.index = _nombres_de_mes
resumen_mensual_simple
N_records μ σ y_min Q1 Q3 y_max Kurtosis Skewness
Mes
January 217 407.477151 163.626147 68.500000 283.104167 383.895833 516.208333 799.916667 -0.371655 0.164949
February 198 317.083649 133.032529 53.875000 235.833333 304.166667 392.291667 659.645833 0.033322 0.372261
March 186 234.346550 127.571404 3.875000 154.541667 225.500000 298.250000 610.666667 1.327689 0.917040
April 180 237.857407 141.355477 0.583333 140.453125 232.500000 306.776042 596.875000 0.105675 0.537886
May 186 257.869848 113.049078 69.791667 160.177083 244.722222 350.583333 518.916667 -0.768204 0.174336
June 180 230.266204 111.890502 0.000000 147.541667 226.770833 324.916667 430.125000 -0.636302 -0.359935
July 186 252.023746 137.279431 27.916667 168.791667 217.583333 357.541667 630.291667 0.256444 0.710706
August 186 277.110215 101.562718 61.750000 210.166667 274.187500 344.500000 494.375000 -0.488741 -0.054911
September 180 274.390278 129.997583 40.041667 181.416667 258.791667 339.291667 579.208333 -0.158738 0.444753
October 186 321.683580 149.988278 60.333333 185.083333 332.604167 430.125000 668.291667 -0.582618 0.048882
November 180 291.265625 137.199636 32.208333 196.562500 267.375000 364.041667 687.604167 0.801704 0.766444
December 186 284.581317 139.366372 28.833333 194.666667 253.104167 382.250000 743.041667 0.707201 0.739721
Code
# Mean hourly radiation for each calendar month, plotted January to December.
orden_meses = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']

_media_por_numero_de_mes = (
    df_INMET.groupby(df_INMET.index.month)['RADIACAO_GLOBAL_IMPUTADA'].mean()
)
# Month number -> month name.
_media_por_numero_de_mes.index = pd.to_datetime(
    _media_por_numero_de_mes.index, format='%m'
).month_name()

promedio_por_mes = _media_por_numero_de_mes.reset_index()
promedio_por_mes.columns = ['Month', 'Average Radiation']
# `.loc` raises KeyError if a month is absent from the data.
promedio_por_mes = promedio_por_mes.set_index('Month').loc[orden_meses].reset_index()

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=promedio_por_mes, x='Month', y='Average Radiation', marker='o', ax=ax)
ax.set_title('Average Global Radiation by Month of the Year', fontsize=18)
ax.set_xlabel('Month of the Year', fontsize=14)
ax.set_ylabel('Global Radiation (Imputed)', fontsize=14)
ax.tick_params(axis='both', labelsize=13)
ax.grid(True)
fig.tight_layout()
fig.savefig("figures/avg_radiation_by_month.png", dpi=300)
plt.show()

Code
# Violin plot (with inner box) of the hourly imputed radiation per month.
# 'Month' holds the month name taken from the datetime index.
df_INMET['Month'] = df_INMET.index.month_name()
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']

os.makedirs("figures", exist_ok=True)
fig, ax = plt.subplots(figsize=(14, 6))
sns.violinplot(
    data=df_INMET,
    x='Month',
    y='RADIACAO_GLOBAL_IMPUTADA',
    order=month_order,
    inner='box',
    ax=ax,
)
ax.set_title('Distribution of Global Radiation by Month', fontsize=18)
ax.set_xlabel('Month', fontsize=16)
ax.set_ylabel('Global Radiation', fontsize=16)
ax.tick_params(axis='both', labelsize=14)
ax.grid(True)
fig.tight_layout()
fig.savefig("figures/violinplot_radiation_by_month.png", dpi=300)
plt.show()

Code
# Interactive violin plot per month. 'Month' becomes an ordered categorical
# that follows month_order.
_tipo_mes = pd.CategoricalDtype(categories=month_order, ordered=True)
df_INMET['Month'] = df_INMET['Month'].astype(_tipo_mes)

fig = px.violin(
    df_INMET,
    x='Month',
    y='RADIACAO_GLOBAL_IMPUTADA',
    box=True,
    points=False,
    category_orders={'Month': month_order},
    title='Distribution of Imputed Global Radiation by Month',
)
_opciones_layout = dict(
    title_font_size=18,
    xaxis_title='Month',
    yaxis_title='Global Radiation (Imputed)',
    xaxis_tickangle=45,
    font=dict(size=12),
)
fig.update_layout(**_opciones_layout)
fig.show()
Code
# Autocorrelation plot of the monthly means.
serie_mensual = df_INMET['RADIACAO_GLOBAL_IMPUTADA'].resample('M').mean()
fig, ax = plt.subplots(figsize=(5, 4))
autocorrelation_plot(serie_mensual, ax=ax)
ax.set_title("Autocorrelation - Global Radiation (Monthly Series)", fontsize=12)
ax.grid(True)
fig.tight_layout()
fig.savefig("figures/autocorrelation_monthly_series.png", dpi=300)
plt.show()

Code
# Partial autocorrelation (30 lags) of the monthly means.
#
# plot_pacf opens a figure of its own unless it is given an axes. Passing
# `ax` makes it draw on the 5x4 figure created here, so no empty figure is
# left behind and the requested size is the one that gets saved.
fig, ax = plt.subplots(figsize=(5, 4))
plot_pacf(serie_mensual, lags=30, method='ywm', ax=ax)
ax.set_title("PACF - Global Radiation (Monthly Series)", fontsize=12)
ax.grid(True)
fig.tight_layout()
fig.savefig("figures/pacf_monthly_series.png", dpi=300)
plt.show()
<Figure size 500x400 with 0 Axes>